In [25]:
# Import necessary libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import interp1d
import yaml
import os
import statsmodels.stats.api as sms
import scanpy as sc
import plotly.express as px

# Load the imaging data

Load imaging data¶

In [2]:
figDir = "./figures"
if not os.path.exists(figDir):
    os.makedirs(figDir)
tablesDir = "./tables"
if not os.path.exists(tablesDir):
    os.makedirs(tablesDir)
outdir = "../data/output"
# Load colors dict
with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
colorsmap = dict(zip([i["newName"] for i in iPSC_lines_map.values()],[i["color"] for i in iPSC_lines_map.values()]))
In [3]:
dataImaging=pd.read_csv('../data/imaging_data/organoidMultiplexing_growthCurves_quant.csv')

# Filter and reformat the imaging data
dataImaging = dataImaging.loc[dataImaging["FileName"].str.contains("MIX"), ["Line","Day","LineRep","EquivalentDiameterArea(microns)"]]
dataImaging["replicate"] = dataImaging["LineRep"].str.split("_",expand=True)[1].astype(int)
dataImaging = dataImaging[["Line","Day","replicate","EquivalentDiameterArea(microns)"]].rename(columns={"Line":"Mix"})
# removing incomplete mix 7
dataImaging = dataImaging[dataImaging["Mix"] != "MIX7"]
In [4]:
dataImaging
Out[4]:
Mix Day replicate EquivalentDiameterArea(microns)
2 MIX8 0 3 463.653039
3 MIX6 0 4 471.100434
5 MIX8 0 4 480.015620
7 MIX1 0 3 355.681447
11 MIX5 0 1 459.539955
... ... ... ... ...
585 MIX2 10 1 1211.001565
588 MIX1 10 1 844.396092
590 MIX5 10 1 958.028792
591 MIX4 10 3 898.788100
597 MIX5 10 4 981.861506

209 rows × 4 columns

Load census data¶

In [5]:
dataCensus=pd.read_csv('../data/census_seq/CensusSeq_data.csv')
# Filter and reformat the census data
dataCensus["Timepoint"] = dataCensus["Timepoint"].str.split(" ", expand= True)[1].astype(int)
dataCensus = dataCensus[["Sample name","DONOR","REPRESENTATION","MIX ID","Timepoint"]].rename(columns={"MIX ID":"Mix","Timepoint":"Day"})
dataCensus["Mix"] = "MIX"+dataCensus["Mix"].astype(str)
# removing incomplete mix 7
dataCensus = dataCensus[dataCensus["Mix"] != "MIX7"]
dataCensus["DONOR"] = dataCensus["DONOR"].replace({"CTL01A":"CTL01"})
dataCensus["DONOR"] = dataCensus["DONOR"].replace({"CHD2WT":"UCSFi001-A"})
dataCensus["DONOR"] = dataCensus["DONOR"].replace({"CHD8WT":"H9"})
In [6]:
dataCensus
Out[6]:
Sample name DONOR REPRESENTATION Mix Day
0 M18 CTL01 0.94993 MIX4 5
1 M18 CTL02A 0.00285 MIX4 5
2 M18 CTL05A 0.00750 MIX4 5
3 M18 CTL07C 0.01086 MIX4 5
4 M18 H1 0.00647 MIX4 5
... ... ... ... ... ...
577 M60 H9 0.30743 MIX3 25
578 M60 CTL04E 0.01336 MIX3 25
579 M60 CTL06F 0.24393 MIX3 25
580 M60 CTL08A 0.02211 MIX3 25
581 M60 CTL09A 0.40072 MIX3 25

546 rows × 5 columns

In [7]:
# Define the density volume
densityVolume = 0.000767495

# Transform diameter to cells
dataImaging["EquivalentSphereVolume"] = (np.power((dataImaging["EquivalentDiameterArea(microns)"]/2), 3))*math.pi*(4/3)
dataImaging["nCells"] = dataImaging["EquivalentSphereVolume"]*densityVolume

#%%
# Compute DeltaGrowth
GrowthRate = pd.DataFrame()
# Loop over each unique mix in the census data
for m in dataCensus.Mix.unique().tolist():
    GrowthRateMix = pd.DataFrame()
    # Filter the data for the current mix
    DataMix = dataCensus[dataCensus["Mix"] == m]
    DataMix = DataMix[DataMix["Day"].isin([5,12,-2])]
    DataMix["ClosestDay"] = DataMix["Day"].replace({5:4,12:10,-2:0})
    TPTS = np.sort(DataMix.ClosestDay.unique())
    # Loop over each unique timepoint for the current mix
    for t in TPTS:
        # Compute the average representation for each donor at the current timepoint
        DataMixTPT = DataMix[DataMix["ClosestDay"] == t].groupby("DONOR")["REPRESENTATION"].mean().to_frame().T.reset_index(drop=True)
        DataMixTPT.columns = ["AvgCensus."+i for i in DataMixTPT.columns.tolist()]
        # Compute the mean and standard deviation of the number of cells for the current mix and timepoint
        Mean = round(dataImaging.loc[(dataImaging["Day"] == t) &(dataImaging["Mix"] == m),"nCells"].mean(),2)
        std = round(dataImaging.loc[(dataImaging["Day"] == t) &(dataImaging["Mix"] == m),"nCells"].std(),2)
        # Create a dataframe with the computed values
        NcellsDF = pd.DataFrame({"Mix":m,"Day":t,"MeanCells":Mean, "STDcells":std}, index=[0])
        NcellsDF = pd.concat([NcellsDF,DataMixTPT], axis = 1)
        # Append the dataframe to the growth rate dataframe for the current mix
        GrowthRateMix = pd.concat([GrowthRateMix,NcellsDF], ignore_index=True)
    # If there are at least two timepoints for the current mix, compute the growth rate
    if GrowthRateMix.shape[0] >= 2:
        # Compute the growth rate and add it to the growth rate dataframe
        GrowthRateMix = GrowthRateMix.sort_values("Day")
        GrowThaRateVals  = GrowthRateMix.T.loc[DataMixTPT.columns]
        GrowThaRateVals = GrowThaRateVals.T.div(GrowThaRateVals.T.shift(1)).T.dropna(axis = 1)
        GrowThaRateVals.index = ["GR."+i.split(".")[1] for i in GrowThaRateVals.index]
        GrowThaRateVals.columns = ["D{}_to_D{}_growthRate".format(GrowthRateMix.loc[i,"Day"],GrowthRateMix.loc[i+1,"Day"]) for i in GrowthRateMix.sort_values("Day").index.tolist()[:-1]]
        GrowThaRateVals["Mix"] = m
        GrowThaRateVals["DONOR"] = [i.split(".")[1] for i in GrowThaRateVals.index.tolist()]
        # Add the mean and standard deviation of the number of cells and the average census to the growth rate dataframe
        for d in GrowthRateMix.Day.unique():
            GrowThaRateVals["MeanCells.D{}".format(d)] = GrowthRateMix.loc[GrowthRateMix["Day"] == d,"MeanCells"].values[0]
            GrowThaRateVals["STDcells.D{}".format(d)] = GrowthRateMix.loc[GrowthRateMix["Day"] == d,"STDcells"].values[0]
            AvgCensus = GrowthRateMix.loc[GrowthRateMix["Day"] == d, [i for i in GrowthRateMix.columns.tolist() if i.startswith("Avg")]].T
            AvgCensus.index = [i.split(".")[1] for i in AvgCensus.index.tolist()]
            GrowThaRateVals["AvgCensus.D{}".format(d)] = AvgCensus.loc[GrowThaRateVals.DONOR].to_numpy().flatten()
        # Append the growth rate dataframe for the current mix to the overall growth rate dataframe
        GrowthRate = pd.concat([GrowthRate,GrowThaRateVals], ignore_index=True)
In [8]:
GrowthRate
Out[8]:
D0_to_D4_growthRate D4_to_D10_growthRate Mix DONOR MeanCells.D0 STDcells.D0 AvgCensus.D0 MeanCells.D4 STDcells.D4 AvgCensus.D4 MeanCells.D10 STDcells.D10 AvgCensus.D10
0 4.172086 1.01204 MIX4 CTL01 31778.67 7785.07 0.22204 55127.37 4383.50 0.926370 312229.25 17148.21 0.937523
1 0.032907 1.397864 MIX4 CTL02A 31778.67 7785.07 0.15174 55127.37 4383.50 0.004993 312229.25 17148.21 0.006980
2 0.07715 1.580955 MIX4 CTL05A 31778.67 7785.07 0.13476 55127.37 4383.50 0.010397 312229.25 17148.21 0.016437
3 0.104393 0.996621 MIX4 CTL07C 31778.67 7785.07 0.14174 55127.37 4383.50 0.014797 312229.25 17148.21 0.014747
4 0.038165 0.624666 MIX4 H1 31778.67 7785.07 0.16359 55127.37 4383.50 0.006243 312229.25 17148.21 0.003900
5 0.199842 0.548795 MIX4 KTD8_2 31778.67 7785.07 0.18613 55127.37 4383.50 0.037197 312229.25 17148.21 0.020413
6 3.768502 0.906179 MIX5 CTL01 48595.31 8011.44 0.20628 68553.09 7615.66 0.777367 377111.59 38194.70 0.704433
7 0.133898 0.521918 MIX5 CTL04E 48595.31 8011.44 0.20558 68553.09 7615.66 0.027527 377111.59 38194.70 0.014367
8 0.137649 2.841229 MIX5 CTL05A 48595.31 8011.44 0.13239 68553.09 7615.66 0.018223 377111.59 38194.70 0.051777
9 0.551949 2.552438 MIX5 CTL06F 48595.31 8011.44 0.13279 68553.09 7615.66 0.073293 377111.59 38194.70 0.187077
10 0.039852 0.900892 MIX5 H1 48595.31 8011.44 0.16879 68553.09 7615.66 0.006727 377111.59 38194.70 0.006060
11 0.628373 0.374557 MIX5 UCSFi001-A 48595.31 8011.44 0.15416 68553.09 7615.66 0.096870 377111.59 38194.70 0.036283
12 2.942335 NaN MIX8 CTL01 41645.04 1760.62 0.30053 73605.42 3366.29 0.884260 NaN NaN NaN
13 0.021881 NaN MIX8 CTL02A 41645.04 1760.62 0.19469 73605.42 3366.29 0.004260 NaN NaN NaN
14 0.149044 NaN MIX8 CTL04E 41645.04 1760.62 0.23975 73605.42 3366.29 0.035733 NaN NaN NaN
15 0.285804 NaN MIX8 CTL08A 41645.04 1760.62 0.26503 73605.42 3366.29 0.075747 NaN NaN NaN
16 0.786518 0.757478 MIX1 CTL04E 21740.41 9982.70 0.25248 40500.52 9223.00 0.198580 326966.32 56339.81 0.150420
17 0.248389 1.639404 MIX1 CTL05A 21740.41 9982.70 0.16654 40500.52 9223.00 0.041367 326966.32 56339.81 0.067817
18 1.2327 2.985774 MIX1 CTL06F 21740.41 9982.70 0.17678 40500.52 9223.00 0.217917 326966.32 56339.81 0.650650
19 0.040518 0.506822 MIX1 H1 21740.41 9982.70 0.20501 40500.52 9223.00 0.008307 326966.32 56339.81 0.004210
20 2.680004 0.237722 MIX1 UCSFi001-A 21740.41 9982.70 0.19919 40500.52 9223.00 0.533830 326966.32 56339.81 0.126903
21 2.432402 0.382556 MIX2 CTL01 38089.55 3161.55 0.25723 96140.42 4002.24 0.625687 713617.89 196799.77 0.239360
22 0.064664 0.410425 MIX2 CTL02A 38089.55 3161.55 0.17207 96140.42 4002.24 0.011127 713617.89 196799.77 0.004567
23 0.188459 0.289028 MIX2 CTL07C 38089.55 3161.55 0.15556 96140.42 4002.24 0.029317 713617.89 196799.77 0.008473
24 0.362503 0.253184 MIX2 CTL08A 38089.55 3161.55 0.25129 96140.42 4002.24 0.091093 713617.89 196799.77 0.023063
25 1.48166 2.984457 MIX2 CTL09A 38089.55 3161.55 0.16385 96140.42 4002.24 0.242770 713617.89 196799.77 0.724537
26 0.113241 0.655055 MIX3 CTL04E 50790.89 5151.37 0.17323 90932.18 2678.76 0.019617 526151.83 20702.51 0.012850
27 0.289329 3.071937 MIX3 CTL06F 50790.89 5151.37 0.12620 90932.18 2678.76 0.036513 526151.83 20702.51 0.112167
28 0.343489 0.441506 MIX3 CTL08A 50790.89 5151.37 0.20547 90932.18 2678.76 0.070577 526151.83 20702.51 0.031160
29 1.39796 1.84376 MIX3 CTL09A 50790.89 5151.37 0.13462 90932.18 2678.76 0.188193 526151.83 20702.51 0.346983
30 3.2987 0.810998 MIX3 H9 50790.89 5151.37 0.17382 90932.18 2678.76 0.573380 526151.83 20702.51 0.465010
31 0.598539 0.284841 MIX3 UCSFi001-A 50790.89 5151.37 0.18666 90932.18 2678.76 0.111723 526151.83 20702.51 0.031823
32 3.672488 0.534164 MIX6 CTL01 45944.60 6040.60 0.11796 83073.41 4017.45 0.433207 567830.49 50718.91 0.231403
33 0.001423 13.444444 MIX6 CTL02A 45944.60 6040.60 0.08433 83073.41 4017.45 0.000120 567830.49 50718.91 0.001613
34 0.251951 0.535659 MIX6 CTL04E 45944.60 6040.60 0.11149 83073.41 4017.45 0.028090 567830.49 50718.91 0.015047
35 0.232655 1.661672 MIX6 CTL05A 45944.60 6040.60 0.08605 83073.41 4017.45 0.020020 567830.49 50718.91 0.033267
36 0.565313 2.113249 MIX6 CTL06F 45944.60 6040.60 0.08518 83073.41 4017.45 0.048153 567830.49 50718.91 0.101760
37 0.305695 0.572213 MIX6 CTL07C 45944.60 6040.60 0.08539 83073.41 4017.45 0.026103 567830.49 50718.91 0.014937
38 0.73663 0.370558 MIX6 CTL08A 45944.60 6040.60 0.13064 83073.41 4017.45 0.096233 567830.49 50718.91 0.035660
39 2.242307 2.439426 MIX6 CTL09A 45944.60 6040.60 0.09587 83073.41 4017.45 0.214970 567830.49 50718.91 0.524403
40 0.114095 0.711851 MIX6 H1 45944.60 6040.60 0.10798 83073.41 4017.45 0.012320 567830.49 50718.91 0.008770
41 1.270102 0.274423 MIX6 UCSFi001-A 45944.60 6040.60 0.09510 83073.41 4017.45 0.120787 567830.49 50718.91 0.033147

D0 to D4 growth rate stability¶

In [9]:
fig, axes = plt.subplots(1,1, figsize=(15,5), dpi=300, sharex=False,sharey=False)
sns.boxplot(data=GrowthRate, x="DONOR", y="D0_to_D4_growthRate", whis=(0, 100),palette=colorsmap)
sns.stripplot(data=GrowthRate, x="DONOR", y="D0_to_D4_growthRate", size=10, color=".3")
axes.title.set_text("Growth rate stability from D0 to D4")
axes.set_xlabel("Cell line", fontsize=20)
axes.set_ylabel("D0 to D4 growth rate", fontsize=20)
sns.despine(offset=30)
plt.show()

fig.savefig(figDir+"/GrowthRateStability.0to4.svg", format='svg', bbox_inches='tight')

D4 to D10 growth rate stability¶

In [10]:
fig, axes = plt.subplots(1,1, figsize=(15,5), dpi=300, sharex=False,sharey=False)

sns.boxplot(data=GrowthRate, x="DONOR", y="D4_to_D10_growthRate", whis=(0, 100),palette=colorsmap)
sns.stripplot(data=GrowthRate, x="DONOR", y="D4_to_D10_growthRate", size=10, color=".3")
axes.title.set_text("Growth rate stability from D4 to D10")
axes.set_xlabel("Cell line", fontsize=20)
axes.set_ylabel("D4 to D10 growth rate", fontsize=20)
sns.despine(offset=30)
plt.show()
fig.savefig(figDir+"/GrowthRateStability.4to10.svg", format='svg', bbox_inches='tight')
In [11]:
# %%

sns.histplot(GrowthRate["D4_to_D10_growthRate"])
total=GrowthRate["D4_to_D10_growthRate"].tolist()
total.extend(GrowthRate["D0_to_D4_growthRate"].tolist())
sns.histplot(total)
Out[11]:
<AxesSubplot: xlabel='D4_to_D10_growthRate', ylabel='Count'>
In [12]:
import matplotlib.pyplot as plt

# Assuming GrowthRate is a pandas DataFrame
D4_to_D10_growthRate = GrowthRate["D4_to_D10_growthRate"]
D0_to_D4_growthRate = GrowthRate["D0_to_D4_growthRate"]

plt.hist([D4_to_D10_growthRate, D0_to_D4_growthRate,total], label=['D4_to_D10_growthRate', 'D0_to_D4_growthRate', 'Total'], stacked=False,density=True)
plt.legend(loc='upper right')
plt.show()
In [13]:
sns.set_style("whitegrid")
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )
sns.despine(fig=ax.figure)
sns.histplot(total, stat='probability', ax = ax)
# Assume 'ax' is the axes object containing the histogram
ax = plt.gca()

# Get the rectangles representing the bars
patches = ax.patches

# Extract the heights (frequencies) and the left edges of the rectangles
frequencies = [patch.get_height() for patch in patches]
bin_edges = [patch.get_xy()[0] for patch in patches]

# Calculate the probabilities
probabilities = frequencies
In [14]:
# %%
mix_number=10
cell_dict={}
EB_cell_number=20000
total_cycles=10000
for i in range(1,total_cycles):
    generated_data = np.random.choice(bin_edges, size=mix_number, p=probabilities)
    cell_dict[i]=EB_cell_number*generated_data

simulated_exp=pd.DataFrame(cell_dict)

simulated_exp = simulated_exp.div(simulated_exp.sum(axis=0), axis=1)>0.05
sns.histplot(simulated_exp.sum(axis=0))
Out[14]:
<AxesSubplot: ylabel='Count'>
In [15]:
min_x = 0
max_x = 6



sns.histplot(total, stat='probability')
# Assume 'ax' is the axes object containing the histogram
ax = plt.gca()

# Get the rectangles representing the bars
patches = ax.patches

# Extract the heights (frequencies) and the left edges of the rectangles
frequencies = [patch.get_height() for patch in patches]
bin_edges = [patch.get_xy()[0] for patch in patches]

# Calculate the probabilities
probabilities = frequencies

# Get density from Seaborn
x, y = sns.kdeplot(np.random.choice(bin_edges, size=10000, p=probabilities),clip=(min_x, max_x)).lines[0].get_data()

probabilities=y/sum(y)

mix_number=10
cell_dict={}
EB_cell_number=20000
total_cycles=10000
for i in range(1,total_cycles):
    generated_data = np.random.choice(x, size=mix_number, p=probabilities)
    cell_dict[i]=EB_cell_number/mix_number*generated_data

simulated_exp=pd.DataFrame(cell_dict)

simulated_exp = simulated_exp.div(simulated_exp.sum(axis=0), axis=1)>0.05


sns.set_style("whitegrid")
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )

sns.histplot(simulated_exp.sum(axis=0),binwidth=1, ax = ax)
Out[15]:
<AxesSubplot: ylabel='Count'>
In [16]:
def empirical_dist(total, min_x, max_x):
    plt.clf()
    sns.histplot(total, stat='probability')
    ax = plt.gca()
    patches = ax.patches
    frequencies = [patch.get_height() for patch in patches]
    bin_edges = [patch.get_xy()[0] for patch in patches]
    x, y = sns.kdeplot(np.random.choice(bin_edges, size=10000, p=frequencies),clip=(min_x, max_x)).lines[0].get_data()
    probabilities=y/sum(y)
    return x, probabilities

cell_dict={}
median_dict={}


min_x = 0
max_x = 9

mix_number=2
max_mix=20
EB_cell_number=20000
total_cycles=10000
nFinalCells = 100000
minCellsPerCelltype = 100
RarestCelltypeRate = 0.05
#what is the minimum number of cells per genotype given RarestCelltypeRate and  minCellsPerCelltype required?
# RarestCelltypeRate(x100) : 100 = minCellsPerCelltype : X
MinCellsPerGenotype = (100*minCellsPerCelltype)/(RarestCelltypeRate*100)
MinCellsPerGenotype_Rate = MinCellsPerGenotype/nFinalCells



x, probabilities = empirical_dist(GrowthRate["D0_to_D4_growthRate"].tolist(), min_x, max_x)
x1, probabilities1 = empirical_dist(GrowthRate["D4_to_D10_growthRate"].tolist(), min_x, max_x)

k=0
while mix_number <= max_mix:
    cell_dict[mix_number]={}
    for i in range(1,total_cycles):
        generated_data = np.random.choice(x, size=mix_number, p=probabilities)
        generated_data_2 = np.random.choice(x1, size=mix_number, p=probabilities1)
        if sum(generated_data)>mix_number:
            k+=1
            generated_cell=(EB_cell_number/mix_number)*generated_data
            # if sum(generated_cell*generated_data_2)>sum(generated_cell):
            #     cell_dict[mix_number][k]=generated_cell*generated_data_2
            cell_dict[mix_number][k]=generated_cell
    

    simulated_exp=pd.DataFrame(cell_dict[mix_number])


    plt.clf()
    plot_data=(simulated_exp.div(simulated_exp.sum(axis=0), axis=1)>MinCellsPerGenotype_Rate).sum(axis=0)
    testplot=sns.histplot(plot_data,discrete=True,stat='probability')
    testplot.set(xlabel='Number of analysable genotypes', ylabel='Frequency')
    testplot.set(xticks=range(0,15,1))
    plt.title('Number of genotypes mixed: '+str(mix_number))
    # plt.show()
    plt.savefig(figDir+'/histogram_dual_'+str(mix_number)+'.png')
    
    median_dict[mix_number]=[plot_data.mean()-plot_data.std(),plot_data.mean(),plot_data.mean()+plot_data.std()]
    mix_number+=1
    print(plot_data.std())

plt.clf()



data_samplings=pd.DataFrame(median_dict).T
data_samplings['x']=data_samplings.index






# PLot empirical distribution
sns.set_style("whitegrid")
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )

sns.set_style("white")
#plt.plot(data_samplings["x"], data_samplings[1], '-', linewidth = 3, label = "Median")
sns.lineplot(data=data_samplings,x="x",y=1,color="#FDA63A",
                 linewidth=3, ax = ax)
sns.lineplot(data=data_samplings,x="x",y=0,color="blue",
                 linewidth=1.5, ax = ax)
sns.lineplot(data=data_samplings,x="x",y=2,color="blue",
                 linewidth=1.5, ax = ax)
plt.fill_between(data_samplings["x"], data_samplings[0], data_samplings[2] , 
                 alpha=0.2, color='blue',linewidth=0.0)
#plt.xlabel('pseudotime', size=30)

ax.set_ylim([0,
             round(data_samplings[2].max()+1) ])

ax.set_xlim([0, data_samplings["x"].max()])

ax.set_xticks([0, 2,5,10,20])

ax.set_xlabel('Mixed cell lines') # Y label
ax.set_ylabel('Recovered cell lines') # Y label

sns.despine(fig=ax.figure)
fig.savefig(figDir+"/MinGenotypeRate0.05.Simulation.svg", format='svg', bbox_inches='tight')
0.35022365621497326
0.5113858785630576
0.6360796124255776
0.7587491990292478
0.861063352678361
0.9588591510514366
1.0355798223004864
1.112408498986447
1.1675997689414286
1.2417317661082883
1.2796660181449084
1.3421836974746213
1.379552503611448
1.4277258582445909
1.4696310161549155
1.5129859381928021
1.5437397379132072
1.5503650190953209
1.6061752540136054
<Figure size 432x288 with 0 Axes>

Assess main cell types abundances¶

In [17]:
adataComp = sc.read(outdir+"/adatas/adataPaga.h5ad")
In [18]:
leidenOrder=["ProliferatingProgenitors","RadialGliaProgenitors","OuterRadialGliaAstrocytes","CajalR_like","Neurons","MigratingNeurons","GlutamatergicNeurons_early","GlutamatergicNeurons_late","Interneurons_GAD2","Interneurons"]

for paradigm in adataComp.obs.type.unique():
    adataCompStage = adataComp[adataComp.obs["type"] == paradigm]
    compositions = pd.DataFrame(adataCompStage.obs.groupby(["leidenAnnotated","stage"]).size())
    compositions = compositions.reset_index().rename(columns={0:"number_of_cells"})
    compositions["cells_fraction"] = compositions["number_of_cells"] / np.array(compositions.groupby("stage")["number_of_cells"].sum()[compositions["stage"].tolist()])
    compositions["cells_fraction"] = round(compositions["cells_fraction"],2)
    fig = px.bar(compositions, x="stage", y="cells_fraction", color="leidenAnnotated", hover_data=compositions, text="cells_fraction",
                 category_orders={"stage":['early', 'mid', 'late'],
                                 "leidenAnnotated":leidenOrder}, height=800,width=1000, template="plotly_white",title=paradigm,
                 color_discrete_map = dict(zip(adataCompStage.obs["leidenAnnotated"].cat.categories,adataCompStage.uns["leidenAnnotated_colors"]))
    )

    fig.update_traces(textangle=0, marker_line_color='rgb(8,48,107)', textposition = 'inside',
                    marker_line_width=1.5, opacity=1)



    fig.update_layout(font=dict(size=25, color='black'),
        yaxis = dict(tickfont = dict(size=30)),
        xaxis = dict(tickfont = dict(size=30)))

    fig.show()

~ 10 % rarest cell type should be enough to capture main cell types¶

In [88]:
cell_dict={}
median_dict={}


min_x = 0
max_x = 9

mix_number=2
max_mix=20
EB_cell_number=20000
total_cycles=100000
nFinalCells = 100000
minCellsPerCelltype = 100
RarestCelltypeRate = 0.09
#what is the minimum number of cells per genotype given RarestCelltypeRate and  minCellsPerCelltype required?
# RarestCelltypeRate(x100) : 100 = minCellsPerCelltype : X
MinCellsPerGenotype = (100*minCellsPerCelltype)/(RarestCelltypeRate*100)
MinCellsPerGenotype_Rate = MinCellsPerGenotype/nFinalCells
print(MinCellsPerGenotype_Rate)


x, probabilities = empirical_dist(GrowthRate["D0_to_D4_growthRate"].tolist(), min_x, max_x)
x1, probabilities1 = empirical_dist(GrowthRate["D4_to_D10_growthRate"].tolist(), min_x, max_x)

k=0
while mix_number <= max_mix:
    cell_dict[mix_number]={}
    for i in range(1,total_cycles):
        generated_data = np.random.choice(x, size=mix_number, p=probabilities)
        generated_data_2 = np.random.choice(x1, size=mix_number, p=probabilities1)
        if sum(generated_data)>mix_number:
            k+=1
            generated_cell=(EB_cell_number/mix_number)*generated_data
            # if sum(generated_cell*generated_data_2)>sum(generated_cell):
            #     cell_dict[mix_number][k]=generated_cell*generated_data_2
            cell_dict[mix_number][k]=generated_cell
    

    simulated_exp=pd.DataFrame(cell_dict[mix_number])


    plt.clf()
    plot_data=(simulated_exp.div(simulated_exp.sum(axis=0), axis=1)>MinCellsPerGenotype_Rate).sum(axis=0)
    testplot=sns.histplot(plot_data,discrete=True,stat='probability')
    testplot.set(xlabel='Number of analysable genotypes', ylabel='Frequency')
    testplot.set(xticks=range(0,15,1))
    plt.title('Number of genotypes mixed: '+str(mix_number))
    # plt.show()
    
    median_dict[mix_number]=[plot_data.mean()-plot_data.std(),plot_data.mean(),
                             plot_data.mean()+plot_data.std(),
                             sms.DescrStatsW(plot_data).tconfint_mean(alpha=0.05)[0],
                             sms.DescrStatsW(plot_data).tconfint_mean(alpha=0.05)[1] ]
    #median_dict[mix_number]=[plot_data.mean()-plot_data.std(),plot_data.mean(),plot_data.mean()+plot_data.std()]
    mix_number+=1
    print(plot_data.std())

plt.clf()



data_samplings=pd.DataFrame(median_dict).T
data_samplings['x']=data_samplings.index
data_samplings.columns = ["-1std","mean","+1std","95CI_Low","95CI_Up","x"]
data_samplings["-1std"] = data_samplings["-1std"].astype(float)
data_samplings["+1std"] = data_samplings["+1std"].astype(float)
data_samplings["mean"] = data_samplings["mean"].astype(float)

#%%
# PLot empirical distribution
sns.set_style("whitegrid")
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )

sns.set_style("white")
#plt.plot(data_samplings["x"], data_samplings[1], '-', linewidth = 3, label = "Median")
sns.lineplot(data=data_samplings,x="x",y="mean",color="#FDA63A",
                 linewidth=1, ax = ax,zorder=200)
sns.lineplot(data=data_samplings,x="x",y="-1std",color="blue",
                 linewidth=1.5, ax = ax)
sns.lineplot(data=data_samplings,x="x",y="+1std",color="blue",
                 linewidth=1.5, ax = ax)
plt.fill_between(data_samplings["x"], data_samplings["-1std"], data_samplings["+1std"] , 
                 alpha=0.2, color='blue',linewidth=0.0)


ax.set_ylim([0,
             round(data_samplings["+1std"].max()+1) ])

ax.set_xlim([0, data_samplings["x"].max()])

ax.set_xticks([0, 2,5,10,20])

ax.set_xlabel('Mixed cell lines') # Y label
ax.set_ylabel('Recovered cell lines') # Y label

sns.despine(fig=ax.figure)
fig.savefig(figDir+"/MinGenotypeRate.Simulation.RarestCT:{}.svg".format(MinCellsPerGenotype_Rate), format='svg', bbox_inches='tight')
0.011111111111111112
0.2849344287427844
0.4208638181481634
0.5387833043648257
0.6584005055262023
0.7726536343098065
0.8731419499739559
0.966850234873297
1.051021984167527
1.12639100040836
1.1969438566333812
1.2664553836042336
1.3280871262657166
1.3813775188204362
1.4392394349617124
1.4920005681881165
1.5348624143887402
1.584403817770791
1.6253521422928991
1.666051530216503
<Figure size 360x504 with 0 Axes>
In [91]:
SamplingsTable = data_median[["mean","+1std","95CI_Low","95CI_Up","x"]].rename(columns={"x":"MixedLines","mean":"meanRecoveredLines"})
SamplingsTable["sd"] = SamplingsTable["+1std"] - SamplingsTable["meanRecoveredLines"]
SamplingsTable = SamplingsTable.drop(columns=["+1std"])
SamplingsTable.to_excel(tablesDir+"/MinGenotypeRate.Simulation.RarestCT:{}.xlsx".format(round(MinCellsPerGenotype_Rate,2)), index=False)
In [21]:
from scipy.optimize import curve_fit

# let's fit a power function to the data

def powerFun(x, a, b, c):
    y = a * np.power(x, b) + c
    return y

continous_cuntSpace = np.linspace(min(data_samplings['x']), max(data_samplings['x']), 100)

FittedValues = pd.DataFrame(index = continous_cuntSpace)


fittedParams = curve_fit(powerFun, data_samplings.index.tolist(),data_samplings[1])[0]
FittedValues["Fitted_log_RarestCelltypeRate:{}".format(RarestCelltypeRate)] = powerFun(continous_cuntSpace, *fittedParams)
#sns.lineplot(continous_cuntSpace, powerFun(continous_cuntSpace, *fittedParams), label="powerFun")
fittedY = powerFun(np.array([2,5,10,20]), *fittedParams)
print(fittedY)







from matplotlib import pylab

from matplotlib.pyplot import figure
sns.set_style("whitegrid")
pylab.rcParams['figure.figsize'] = (5, 7)



FittedValues["MixedGenotypes"] = FittedValues.index.tolist()
plot = FittedValues.melt("MixedGenotypes", value_name="Recovered Genotypes").copy()
plot = plot[plot["variable"] == "Fitted_log_RarestCelltypeRate:{}".format(RarestCelltypeRate)]

sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )
sns.lineplot(data=plot,x="MixedGenotypes",y="Recovered Genotypes",hue="variable",palette="Set2",
                 linewidth=3, ax = ax)

#ax.get_legend().remove()# To remove the legend

# ax.set_ylabel('SHH expressing cells', fontsize = 30) # Y label
# ax.set_xlabel('genotype', fontsize = 30) # X label
ax.set_ylim([0,
             round(data_samplings[2].max()+1) ])

ax.set_xlim([0, (max_mix+1)])


#ax.set_yticks([0,.25,.5,.75,1])
fig.suptitle('')
ax.set_xlabel('Mixed cell lines') # Y label
ax.set_ylabel('Recovered cell lines') # Y label
ax.set_xticks([0, 2,5,10,20])



### Pick the vertical lines to plot
vlinelist = [5,10,20]
#fittedParams = curve_fit(powerFun, finalDF.index.tolist(),finalDF["Fitted_log_RarestCelltypeRate:{}".format(RarestCelltypeRate)])[0]
predRecover = powerFun(np.array(vlinelist), *fittedParams)
vlineDict = dict(zip(vlinelist,predRecover))
vlineDictDF = pd.DataFrame(vlineDict, index=["Recovered"]).T
vlineDictDF["mixedLines"] = vlineDictDF.index

for vl in list(vlineDict.keys()):
    plt.vlines(x=vl,ymax=vlineDict[vl],ymin=0, color='#e6e6e6',linestyle='--')
    plt.hlines(y=vlineDict[vl],xmax=vl,xmin=0, color='magenta',linestyle='-',linewidth = .1 )

ax.set_title('Number of recovered cell lines with more than {}% representation'.format(round(MinCellsPerGenotype_Rate*100)))


sns.scatterplot(data = vlineDictDF, x = "mixedLines", y = "Recovered", color = "magenta", 
                s = 50,ax = ax, zorder=100)
ax.get_legend().remove() 

sns.despine(fig=ax.figure)


fig.savefig(figDir+"/MinGenotypeRate.Fitted.RarestCT:{}.svg".format(RarestCelltypeRate), format='svg', bbox_inches='tight')
[ 1.83116391  4.48099458  7.77981212 12.8719096 ]